package com.zhangdi.demo.text;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * @author zhangdi
 * @version 1.0
 * @date 2023.4.23
 */
public class JaccardSimilarity {
    private static final Pattern PATTERN = Pattern.compile("[^\u4e00-\u9fa5a-zA-Z]");
    // 计算Jaccard相似度
    public static double calculate(Set<String> setA, Set<String> setB) {
        Set<String> intersection = new HashSet<>(setA);
        intersection.retainAll(setB);

        Set<String> union = new HashSet<>(setA);
        union.addAll(setB);

        if (union.isEmpty()) {
            return 0;
        }

        return (double) intersection.size() / union.size();
    }

    // 测试
    public static void main(String[] args) {
        String s1 = PATTERN.matcher("莘县河店镇贾庄村美丽乡村三期规划").replaceAll("");
        String s2 = PATTERN.matcher("莘县河店镇贾庄村美丽乡村三期规划").replaceAll("");;

        Set<String> set1 = new HashSet<>(Arrays.asList(s1.split("")));
        Set<String> set2 = new HashSet<>(Arrays.asList(s2.split("")));

        double similarity = calculate(set1, set2);
        System.out.println("Jaccard相似度：" + similarity);
    }
}
